suppressPackageStartupMessages({
library(data.table)
library(DESeq2)
library(gplots)
library(here)
library(hyperSpec)
library(RColorBrewer)
library(tidyverse)
library(VennDiagram)
library(vsn)
})
suppressMessages({
source(here("UPSCb-common/src/R/featureSelection.R"))
source(here("UPSCb-common/src/R/volcanoPlot.R"))
source(here("UPSCb-common/src/R/gopher.R"))
})
pal=brewer.pal(8,"Dark2")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")
TODO remember to add a function to check for correlation between logFC and transcript length - check for the effective length * Functions 1. plot specific transcript expression
"line_plot" <- function(dds=dds,vst=vst,gene_id=gene_id,gene_name=character(0)){
message(paste("Plotting",gene_id))
sel <- grepl(gene_id,rownames(vst))
stopifnot(sum(sel)==1)
d <- bind_cols(as.data.frame(colData(dds)),
data.frame(value=vst[sel,]))
p <- ggplot(d,
aes(x=TIME,y=value,col=TREATMENT)) +
facet_wrap(.~TISSUE) +
geom_point() +
scale_y_continuous(name="VST expression") +
ggtitle(label=paste("Expression for: ",gene_id,ifelse(length(gene_name)>0,gene_name,"")))
suppressMessages(suppressWarnings(plot(p)))
p <- ggplot(d,
aes(x=CONDITION,y=value,col=TISSUE,group=TISSUE)) +
geom_point() + geom_smooth() +
scale_y_continuous(name="VST expression") +
ggtitle(label=paste("Expression for: ",gene_id,ifelse(length(gene_name)>0,gene_name,"")))
suppressMessages(suppressWarnings(plot(p)))
return(NULL)
}
"extract_results" <- function(dds,vst,contrast,
padj=0.01,lfc=0.5,
plot=TRUE,verbose=TRUE,
export=TRUE,default_dir=here("data/analysis/DE"),
default_prefix="DE-",
labels=colnames(dds),
sample_sel=1:ncol(dds),
expression_cutoff=0,
debug=FALSE){
if(length(contrast)==1){
res <- results(dds,name=contrast)
} else {
res <- results(dds,contrast=contrast)
}
if(plot){
par(mar=c(5,5,5,5))
volcanoPlot(res)
par(mar=mar)
}
# a look at independent filtering
if(verbose){
message(sprintf("The independent filtering cutoff is %s, removing %s of the data",
round(metadata(res)$filterThreshold,digits=5),
names(metadata(res)$filterThreshold)))
}
if(plot){
plot(metadata(res)$filterNumRej,
type="b", ylab="number of rejections",
xlab="quantiles of filter")
lines(metadata(res)$lo.fit, col="red")
abline(v=metadata(res)$filterTheta)
}
if(verbose){
message(sprintf("The independent filtering cutoff is %s, removing %s of the data",
round(metadata(res)$filterThreshold,digits=5),
names(metadata(res)$filterThreshold)))
max.theta <- metadata(res)$filterNumRej[which.max(metadata(res)$filterNumRej$numRej),"theta"]
message(sprintf("The independent filtering maximises for %s %% of the data, corresponding to a base mean expression of %s (library-size normalised read)",
round(max.theta*100,digits=5),
round(quantile(counts(dds,normalized=TRUE),probs=max.theta),digits=5)))
}
if(plot){
qtl.exp=quantile(counts(dds,normalized=TRUE),probs=metadata(res)$filterNumRej$theta)
dat <- data.frame(thetas=metadata(res)$filterNumRej$theta,
qtl.exp=qtl.exp,
number.degs=sapply(lapply(qtl.exp,function(qe){
res$padj <= padj & abs(res$log2FoldChange) >= lfc &
! is.na(res$padj) & res$baseMean >= qe
}),sum))
plot(ggplot(dat,aes(x=thetas,y=qtl.exp)) +
geom_line() + geom_point() +
scale_x_continuous("quantiles of expression") +
scale_y_continuous("base mean expression") +
geom_hline(yintercept=expression_cutoff,
linetype="dotted",col="red"))
if(debug){
p <- ggplot(dat,aes(x=thetas,y=qtl.exp)) +
geom_line() + geom_point() +
scale_x_continuous("quantiles of expression") +
scale_y_log10("base mean expression") +
geom_hline(yintercept=expression_cutoff,
linetype="dotted",col="red")
suppressMessages(suppressWarnings(plot(p)))
plot(ggplot(dat,aes(x=thetas,y=number.degs)) +
geom_line() + geom_point() +
geom_hline(yintercept=dat$number.degs[1],linetype="dashed") +
scale_x_continuous("quantiles of expression") +
scale_y_continuous("Number of DE transcripts"))
plot(ggplot(dat,aes(x=thetas,y=number.degs[1] - number.degs),aes()) +
geom_line() + geom_point() +
scale_x_continuous("quantiles of expression") +
scale_y_continuous("Cumulative number of DE transcripts"))
plot(ggplot(data.frame(x=dat$thetas[-1],
y=diff(dat$number.degs[1] - dat$number.degs)),aes(x,y)) +
geom_line() + geom_point() +
scale_x_continuous("quantiles of expression") +
scale_y_continuous("Number of DE transcripts per interval"))
plot(ggplot(data.frame(x=dat$qtl.exp[-1],
y=diff(dat$number.degs[1] - dat$number.degs)),aes(x,y)) +
geom_line() + geom_point() +
scale_x_continuous("base mean of expression") +
scale_y_continuous("Number of DE transcripts per interval"))
}
p <- ggplot(data.frame(x=dat$qtl.exp[-1],
y=diff(dat$number.degs[1] - dat$number.degs)),aes(x,y)) +
geom_line() + geom_point() +
scale_x_log10("base mean of expression") +
scale_y_continuous("Number of DE transcripts per interval") +
geom_vline(xintercept=expression_cutoff,
linetype="dotted",col="red")
suppressMessages(suppressWarnings(plot(p)))
}
sel <- res$padj <= padj & abs(res$log2FoldChange) >= lfc & ! is.na(res$padj) & res$baseMean >= expression_cutoff
if(verbose){
message(sprintf("There are %s transcripts that are DE with the following parameters: FDR <= %s, |log2FC| >= %s, base mean expression > %s",
sum(sel),
padj,lfc,expression_cutoff))
}
if(export){
if(!dir.exists(default_dir)){
dir.create(default_dir,showWarnings=FALSE,recursive=TRUE,mode="0771")
}
write.csv(res,file=file.path(default_dir,paste0(default_prefix,"results.csv")))
write.csv(res[sel,],file.path(default_dir,paste0(default_prefix,"transcripts.csv")))
}
if(plot){
if(sum(sel) > 1){
heatmap.2(t(scale(t(vst[sel,sample_sel]))),
distfun = pearson.dist,
hclustfun = function(X){hclust(X,method="ward.D2")},
trace="none",col=hpal,labRow = FALSE,
labCol=labels[sample_sel])
} else {
warning("There are not enough DE transcripts to create a heatmap")
}
}
return(list(all=rownames(res[sel,]),
up=rownames(res[sel & res$log2FoldChange > 0,]),
dn=rownames(res[sel & res$log2FoldChange < 0,])))
}
load(here("data/analysis/salmon/dds-transcripts.rda"))
vsd <- varianceStabilizingTransformation(dds,blind=FALSE)
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
vst <- assay(vsd)
vst <- vst - min(vst)
dir.create(here("data/analysis/DE"),showWarnings=FALSE)
save(vst,file=here("data/analysis/DE/transcript-salmon-vst-aware.rda"))
goi <- read_csv(here("doc/Candidates_and_flowering_genes.csv"))
## Parsed with column specification:
## cols(
## Name = col_character(),
## ID = col_character()
## )
turn them into transcripts
tx <- rownames(vst)[unlist(lapply(goi$ID,grep,rownames(vst)))]
goi <- full_join(tibble(ID=sub("\\.[0-9]+$","",tx),TXID=tx),goi) %>% select(Name,TXID)
## Joining, by = "ID"
stopifnot(all(goi$TXID %in% rownames(vst)))
dev.null <- apply(goi,1,function(ro){line_plot(dds=dds,vst=vst,gene_id = ro[2],gene_name = ro[1])})
## Plotting AT2G47180.1
## Plotting AT1G56600.1
## Plotting AT1G09350.1
## Plotting AT1G09350.2
## Plotting AT1G60470.1
## Plotting AT1G60450.1
## Plotting AT3G28340.1
## Plotting AT1G55740.1
## Plotting AT3G57520.1
## Plotting AT3G57520.2
## Plotting AT3G57520.3
## Plotting AT4G01970.1
## Plotting AT4G01970.2
## Plotting AT5G40390.1
## Plotting AT5G20250.2
## Plotting AT5G20250.1
## Plotting AT5G20250.4
## Plotting AT5G20250.3
## Plotting AT3G06580.1
## Plotting AT1G65480.2
## Plotting AT1G65480.1
## Plotting AT4G20370.1
## Plotting AT1G22770.1
## Plotting AT2G45660.1
## Plotting AT2G45660.2
## Plotting AT4G35900.2
## Plotting AT4G35900.1
## Plotting AT5G15840.1
## Plotting AT5G15840.2
## Plotting AT5G61850.2
## Plotting AT5G61850.1
## Plotting AT1G28130.1
## Plotting AT1G28130.2
## Plotting AT4G27260.1
## Plotting AT5G03840.1
ddsApex <- dds[,dds$TISSUE=="APEX"]
ddsApex <- cbind(ddsApex,ddsApex[,ddsApex$TREATMENT=="Na"])
ddsApex$TREATMENT[ddsApex$TREATMENT=="Na"] <- rep(c("Mock","Dexa"),each=3)
ddsApex$TREATMENT <- droplevels(ddsApex$TREATMENT)
design(ddsApex) <- ~ TIME * TREATMENT
ddsApex <- DESeq(ddsApex)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
Update the vst for convenience
vsdApex <- varianceStabilizingTransformation(ddsApex,blind=FALSE)
vstApex <- assay(vsdApex)
vstApex <- vstApex - min(vstApex)
meanSdPlot(vstApex)
plotDispEsts(ddsApex)
Check the different contrasts
resultsNames(ddsApex)
## [1] "Intercept" "TIME_T1_vs_T0" "TIME_T3_vs_T0"
## [4] "TREATMENT_Dexa_vs_Mock" "TIMET1.TREATMENTDexa" "TIMET3.TREATMENTDexa"
#’ ```{r res, echo=FALSE,eval=FALSE} CHANGEME - here you need to define the contrast you want to study - see the example in the next block.
The contrast can be given by name, as a list (numerator/denominator) or as a vector of weight (e.g. c(0,1)); read the DESeq2 vignette for more info
The label argument is typically one (or a combination) of the metadata stored in colData
The function allows for looking at the independent filtering results using debug=TRUE
If you are not satisfied with the default from DESeq2, you can set your own cutoff using expression_cutoff
You can change the default output file prefix using default_prefix
You can select the set of samples to be added to the heatmap, using the sample_sel argument. It takes a logical vector.
```
AdexaT1 <- extract_results(ddsApex,vstApex,
contrast = "TIMET1.TREATMENTDexa",
default_prefix = "Salmon-Tx_Apex_Dexa-vs-Mock_T1_",
labels = ddsApex$TREATMENT,
sample_sel = ddsApex$TIME=="T1"
)
## Loading required package: LSD
## The independent filtering cutoff is 9.10619, removing 41.74006% of the data
## The independent filtering cutoff is 9.10619, removing 41.74006% of the data
## The independent filtering maximises for 44.873 % of the data, corresponding to a base mean expression of 8.06645 (library-size normalised read)
## There are 337 transcripts that are DE with the following parameters: FDR <= 0.01, |log2FC| >= 0.5, base mean expression > 0
AdexaT3 <- extract_results(ddsApex,vstApex,
contrast = "TIMET3.TREATMENTDexa",
default_prefix = "Salmon-Tx_Apex_Dexa-vs-Mock_T3_",
labels = ddsApex$TREATMENT,
sample_sel = ddsApex$TIME=="T3")
## The independent filtering cutoff is 13.3026, removing 44.873% of the data
## The independent filtering cutoff is 13.3026, removing 44.873% of the data
## The independent filtering maximises for 51.13887 % of the data, corresponding to a base mean expression of 21.90146 (library-size normalised read)
## There are 403 transcripts that are DE with the following parameters: FDR <= 0.01, |log2FC| >= 0.5, base mean expression > 0
res.list <- list(AdexaT1=AdexaT1,
AdexaT3=AdexaT3)
grid.newpage()
grid.draw(venn.diagram(x=lapply(res.list,"[[","all"),
filename=NULL,category.names=names(res.list),fill=pal[1:2]))
grid.newpage()
grid.draw(venn.diagram(x=lapply(res.list,"[[","up"),
filename=NULL,category.names=names(res.list),fill=pal[1:2]))
grid.newpage()
grid.draw(venn.diagram(x=lapply(res.list,"[[","dn"),
filename=NULL,category.names=names(res.list),fill=pal[1:2]))
ddsLeaf <- dds[,dds$TISSUE=="LEAF"]
ddsLeaf <- cbind(ddsLeaf,ddsLeaf[,ddsLeaf$TREATMENT=="Na"])
ddsLeaf$TREATMENT[ddsLeaf$TREATMENT=="Na"] <- rep(c("Mock","Dexa"),each=3)
ddsLeaf$TREATMENT <- droplevels(ddsLeaf$TREATMENT)
design(ddsLeaf) <- ~ TIME * TREATMENT
ddsLeaf <- DESeq(ddsLeaf)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
Update the vst for convenience
vsdLeaf <- varianceStabilizingTransformation(ddsLeaf,blind=FALSE)
vstLeaf <- assay(vsdLeaf)
vstLeaf <- vstLeaf - min(vstLeaf)
plotDispEsts(ddsLeaf)
Check the different contrasts
resultsNames(ddsLeaf)
## [1] "Intercept" "TIME_T1_vs_T0" "TIME_T3_vs_T0"
## [4] "TREATMENT_Dexa_vs_Mock" "TIMET1.TREATMENTDexa" "TIMET3.TREATMENTDexa"
LdexaT1 <- extract_results(ddsLeaf,vstLeaf,
contrast = "TIMET1.TREATMENTDexa",
default_prefix = "Salmon-Tx_Leaf_Dexa-vs-Mock_T1_",
labels = ddsLeaf$TREATMENT,
sample_sel = ddsLeaf$TIME=="T1")
## The independent filtering cutoff is 3.452, removing 39.72963% of the data
## The independent filtering cutoff is 3.452, removing 39.72963% of the data
## The independent filtering maximises for 45.7048 % of the data, corresponding to a base mean expression of 2.95337 (library-size normalised read)
## There are 38 transcripts that are DE with the following parameters: FDR <= 0.01, |log2FC| >= 0.5, base mean expression > 0
LdexaT3 <- extract_results(ddsLeaf,vstLeaf,
contrast = "TIMET3.TREATMENTDexa",
default_prefix = "Salmon-Tx_Leaf_Dexa-vs-Mock_T3_",
labels = ddsLeaf$TREATMENT,
sample_sel = ddsLeaf$TIME=="T3")
## The independent filtering cutoff is 1.74247, removing 35.24825% of the data
## The independent filtering cutoff is 1.74247, removing 35.24825% of the data
## The independent filtering maximises for 54.66757 % of the data, corresponding to a base mean expression of 15.98314 (library-size normalised read)
## There are 34 transcripts that are DE with the following parameters: FDR <= 0.01, |log2FC| >= 0.5, base mean expression > 0
res.list <- list(LdexaT1=LdexaT1,
LdexaT3=LdexaT3)
grid.newpage()
grid.draw(venn.diagram(x=lapply(res.list,"[[","all"),
filename=NULL,category.names=names(res.list),fill=pal[1:2]))
grid.newpage()
grid.draw(venn.diagram(x=lapply(res.list,"[[","up"),
filename=NULL,category.names=names(res.list),fill=pal[1:2]))
grid.newpage()
grid.draw(venn.diagram(x=lapply(res.list,"[[","dn"),
filename=NULL,category.names=names(res.list),fill=pal[1:2]))
res.list <- list(AdexaT1=AdexaT1,
AdexaT3=AdexaT3,
LdexaT1=LdexaT1,
LdexaT3=LdexaT3)
grid.newpage()
grid.draw(venn.diagram(x=lapply(res.list,"[[","all"),
filename=NULL,category.names=names(res.list),fill=pal[1:4]))
grid.newpage()
grid.draw(venn.diagram(x=lapply(res.list,"[[","up"),
filename=NULL,category.names=names(res.list),fill=pal[1:4]))
grid.newpage()
grid.draw(venn.diagram(x=lapply(res.list,"[[","dn"),
filename=NULL,category.names=names(res.list),fill=pal[1:4]))
background <- unique(sub("\\.[0-9]+","",rownames(vst)[featureSelect(vst,dds$CONDITION,exp=0.2)]))
stopifnot(all(sub("\\.[0-9]+","",unlist(res.list)) %in% background))
enr.list <- lapply(res.list,function(r){
lapply(lapply(r,sub,pattern="\\.[0-9]+",replacement=""),
gopher,background=background,task="go",url="athaliana")
})
## Loading required package: jsonlite
##
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
##
## flatten
## No enrichments found in task: go
## No enrichments found in task: go
## No enrichments found in task: go
## No enrichments found in task: go
## No enrichments found in task: go
## No enrichments found in task: go
dev.null <- lapply(names(enr.list),function(n){
r <- enr.list[[n]]
if(! is.null (r$all$go)){
write_tsv(r$all$go,path=file.path(file.path(here("data/analysis/DE",
paste0(n,"Salmon-all-DE-transcripts_GO-enrichment.tsv")))))
write_tsv(r$all$go[,c("id","padj")],path=file.path(file.path(here("data/analysis/DE",
paste0(n,"Salmon-all-DE-transcripts_GO-enrichment_for-REVIGO.tsv")))))
}
if(! is.null (r$up$go)){
write_tsv(r$up$go,path=file.path(file.path(here("data/analysis/DE",
paste0(n,"Salmon-up-DE-transcripts_GO-enrichment.tsv")))))
write_tsv(r$up$go[,c("id","padj")],path=file.path(file.path(here("data/analysis/DE",
paste0(n,"Salmon-up-DE-transcripts_GO-enrichment_for-REVIGO.tsv")))))
}
if(! is.null (r$dn$go)){
write_tsv(r$dn$go,path=file.path(file.path(here("data/analysis/DE",
paste0(n,"Salmon-down-DE-transcripts_GO-enrichment.tsv")))))
write_tsv(r$dn$go[,c("id","padj")],path=file.path(file.path(here("data/analysis/DE",
paste0(n,"Salmon-down-DE-transcripts_GO-enrichment_for-REVIGO.tsv")))))
}
})
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] jsonlite_1.6.1 LSD_4.0-0
## [3] limma_3.42.2 vsn_3.54.0
## [5] VennDiagram_1.6.20 futile.logger_1.4.3
## [7] forcats_0.4.0 stringr_1.4.0
## [9] dplyr_0.8.4 purrr_0.3.3
## [11] readr_1.3.1 tidyr_1.0.2
## [13] tibble_2.1.3 tidyverse_1.3.0
## [15] RColorBrewer_1.1-2 hyperSpec_0.99-20200213
## [17] xml2_1.2.2 ggplot2_3.2.1
## [19] lattice_0.20-38 here_0.1
## [21] gplots_3.0.1.2 DESeq2_1.26.0
## [23] SummarizedExperiment_1.16.1 DelayedArray_0.12.2
## [25] BiocParallel_1.20.1 matrixStats_0.55.0
## [27] Biobase_2.46.0 GenomicRanges_1.38.0
## [29] GenomeInfoDb_1.22.0 IRanges_2.20.2
## [31] S4Vectors_0.24.3 BiocGenerics_0.32.0
## [33] data.table_1.12.8
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 rprojroot_1.3-2 htmlTable_1.13.3
## [4] XVector_0.26.0 fs_1.3.1 base64enc_0.1-3
## [7] rstudioapi_0.11 hexbin_1.28.1 farver_2.0.3
## [10] affyio_1.56.0 bit64_0.9-7 fansi_0.4.1
## [13] AnnotationDbi_1.48.0 lubridate_1.7.4 splines_3.6.2
## [16] geneplotter_1.64.0 knitr_1.28 Formula_1.2-3
## [19] broom_0.5.4 annotate_1.64.0 cluster_2.1.0
## [22] dbplyr_1.4.2 png_0.1-7 BiocManager_1.30.10
## [25] compiler_3.6.2 httr_1.4.1 backports_1.1.5
## [28] assertthat_0.2.1 Matrix_1.2-18 lazyeval_0.2.2
## [31] cli_2.0.1 formatR_1.7 acepack_1.4.1
## [34] htmltools_0.4.0 tools_3.6.2 affy_1.64.0
## [37] gtable_0.3.0 glue_1.3.1 GenomeInfoDbData_1.2.2
## [40] Rcpp_1.0.3 cellranger_1.1.0 vctrs_0.2.2
## [43] preprocessCore_1.48.0 gdata_2.18.0 nlme_3.1-144
## [46] xfun_0.12 rvest_0.3.5 testthat_2.3.1
## [49] lifecycle_0.1.0 gtools_3.8.1 XML_3.99-0.3
## [52] zlibbioc_1.32.0 scales_1.1.0 hms_0.5.3
## [55] lambda.r_1.2.4 curl_4.3 yaml_2.2.1
## [58] memoise_1.1.0 gridExtra_2.3 rpart_4.1-15
## [61] latticeExtra_0.6-29 stringi_1.4.5 RSQLite_2.2.0
## [64] highr_0.8 genefilter_1.68.0 checkmate_2.0.0
## [67] caTools_1.18.0 rlang_0.4.4 pkgconfig_2.0.3
## [70] bitops_1.0-6 evaluate_0.14 labeling_0.3
## [73] htmlwidgets_1.5.1 bit_1.1-15.2 tidyselect_1.0.0
## [76] magrittr_1.5 R6_2.4.1 generics_0.0.2
## [79] Hmisc_4.3-1 DBI_1.1.0 pillar_1.4.3
## [82] haven_2.2.0 foreign_0.8-75 withr_2.1.2
## [85] survival_3.1-8 RCurl_1.98-1.1 nnet_7.3-12
## [88] modelr_0.1.5 crayon_1.3.4 futile.options_1.0.1
## [91] KernSmooth_2.23-16 rmarkdown_2.1 jpeg_0.1-8.1
## [94] locfit_1.5-9.1 readxl_1.3.1 blob_1.2.1
## [97] reprex_0.3.0 digest_0.6.24 xtable_1.8-4
## [100] munsell_0.5.0